knitr::opts_chunk$set(echo = TRUE)

Overview

This RMarkdown shows how to read in the final project data. It also shows how to calculate the logit-transformed response and setup the binary outcome for use with caret or tidymodels. It also demonstrates how to fit a simple model (with lm()), save that model, and load it back into the workspace. You may find these actions helpful as you work through the project.

You must download the data from Canvas and save the data in the same directory as this RMarkdown file.

Load packages

This example uses the tidyverse suite of packages.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggplot2)

Read data

Please download the final project data from Canvas. If this Rmarkdown file is located in the same directory as the downloaded CSV file, it will be able to load in the data for you. It is highly recommended that you use an RStudio RProject to easily manage the working directory and file paths of the code and objects associated with the final project.

The code chunk below reads in the final project data.

df <- readr::read_csv("paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The readr::read_csv() function displays the data types and column names associated with the data. However, a glimpse is shown below that reveals the number of rows and also shows some of the representative values for the columns.

df %>% glimpse()
## Rows: 835
## Columns: 8
## $ R          <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G          <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B          <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness  <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue        <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response   <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ outcome    <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…

Assign numeric and non-numeric fields into variables for reference later.

sel_num <- df %>% select_if(is_double) %>% select(sort(names(.))) %>% colnames()
sel_cat <- df %>% select(!one_of(sel_num)) %>% colnames()

Performing some required data transformations for the response and outcome fields to facilitate processing.

df_eda <- df %>% mutate(
  log_response = log(response),
  binary_outcome = ifelse(outcome=="event", 1, 0))

Categorical Variables

Data Summary

Get a summary of the categorical variables Lightness and Saturation, as well as the binary response. Check missing values, unique values.

summarize_cate_var <- function(df) {
  tibble(
    variable = names(df),
    n_missing = map_dbl(df, ~sum(is.na(.))),
    n_levels = map_dbl(df, n_distinct),
  ) %>% mutate(
    percent_missing = n_missing/nrow(df)*100,
  )
}

summ_cate <- df_eda %>% select(all_of(sel_cat)) %>% summarize_cate_var()
summ_cate %>% knitr::kable(caption = "categorical variables overview")
categorical variables overview
variable n_missing n_levels percent_missing
Lightness 0 7 0
Saturation 0 7 0

Data Distribution

Count the unique levels for each category and sort by count for each category.

df_eda %>% select(sel_cat) %>% pivot_longer(everything(), names_to ="var", values_to="level") %>%
  count(var,level) %>% arrange(var,n) %>%
  knitr::kable(caption = "categorical variables count")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(sel_cat)
## 
##   # Now:
##   data %>% select(all_of(sel_cat))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
categorical variables count
var level n
Lightness dark 117
Lightness deep 119
Lightness midtone 119
Lightness saturated 119
Lightness light 120
Lightness soft 120
Lightness pale 121
Saturation gray 83
Saturation neutral 122
Saturation bright 126
Saturation muted 126
Saturation pure 126
Saturation shaded 126
Saturation subdued 126

Use barchart to visualize the distribution of categorical input and binary response variables.

df_eda %>% select(sel_cat) %>%
  pivot_longer(everything()) %>%
  ggplot() +
  geom_bar(aes(x=value, fill=value)) +
  facet_grid(~name, scales="free_x", space="free_x") +
  xlab("") + theme(legend.position="none")

Continuous Variables

Data Summary

Get a high level summary of the numeric variables. Check missing values, unique values, range and median.

summarize_nume_var <- function(df) {
  tibble(
    variable = names(df),
    n_missing = map_dbl(df, ~sum(is.na(.))),
    n_unique = map_dbl(df, n_distinct),
    min = map_dbl(df, min),
    median = map_dbl(df, median),
    max = map_dbl(df, max)
  ) %>% mutate(
    percent_missing = n_missing/nrow(df)*100,
    percent_unique = n_unique/nrow(df)*100,
  )
}

summ_nume <- summarize_nume_var(df %>% select(all_of(sel_num)))
summ_nume %>% knitr::kable(caption = "numeric variables overview")
numeric variables overview
variable n_missing n_unique min median max percent_missing percent_unique
B 0 183 47 170 238 0 21.916168
G 0 171 58 188 241 0 20.479042
Hue 0 36 1 17 36 0 4.311377
outcome 0 2 0 0 1 0 0.239521
R 0 197 0 203 255 0 23.592814
response 0 83 6 51 87 0 9.940120

Examine the distribution of the continuous response.

ggplot(df_eda, aes(x = R, fill = "R")) +
  geom_histogram(position = "identity", alpha = 0.7, bins = 30) +
  geom_histogram(aes(x = G, fill = "G"), position = "identity", alpha = 0.7, bins = 30) +
  geom_histogram(aes(x = B, fill = "B"), position = "identity", alpha = 0.7, bins = 30) +
  geom_histogram(aes(x = Hue, fill = "Hue"), position = "identity", alpha = 0.7, bins = 30) +
  labs(title = "Combined Histogram for R, G, B, Hue",
       x = "Values",
       y = "Frequency") +
  scale_fill_manual(values = c("R" = "blue", "G" = "green", "B" = "red", "Hue" = "purple")) +
  theme_minimal()

The distribution of continuous variables seems gaussian.

Conditioned on Categorical

#Condition (group) the continuous variables based on the categorical variables.

# Boxplot for R grouped by Lightness and Saturation
ggplot(df, aes(x = Lightness, y = R, fill = Saturation)) +
  geom_boxplot() +
  labs(title = "Boxplot of R by Lightness and Saturation")

Boxplot for G grouped by Lightness and Saturation

ggplot(df, aes(x = Lightness, y = G, fill = Saturation)) +
  geom_boxplot() +
  labs(title = "Boxplot of G by Lightness and Saturation")

Boxplot for B grouped by Lightness and Saturation

ggplot(df, aes(x = Lightness, y = B, fill = Saturation)) +
  geom_boxplot() +
  labs(title = "Boxplot of B by Lightness and Saturation")

Boxplot for hue grouped by Lightness and Saturation

ggplot(df, aes(x = Lightness, y = Hue, fill = Saturation)) +
  geom_boxplot() +
  labs(title = "Boxplot of Hue by Lightness and Saturation")

###Visualize the relationships between the continuous inputs, are they correlated?

Check correlation between continuous input variables.

df_eda %>% select(all_of(sel_num)) %>% select(-response) %>%
  cor() %>%
  corrplot::corrplot(type="upper", method="color", diag=F)

#Visualize the relationships between the continuous outputs (response and the LOGIT-transformed response, y) with respect to the continuous INPUTS.

Examine the response with regard to every continuous input.

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
scatterplot <- plot_ly(data = df_eda, type = "scatter", mode = "markers", alpha = 0.7) %>%
  add_trace(x = ~R, y = ~response, name = "R") %>%
  add_trace(x = ~G, y = ~response, name = "G") %>%
  add_trace(x = ~B, y = ~response, name = "B") %>%
  add_trace(x = ~Hue, y = ~response, name = "Hue") %>% 
  plotly::layout(title = "Scatterplot Matrix of Continuous Inputs vs. Response",
           showlegend = TRUE,
           legend = list(x = 0.5, y = 1, traceorder = "normal", font = list(family = "sans-serif", size = 12, color = "black")))

# Show the plot
scatterplot

Examine the log-transformed response with regard to every continuous input.

scatterplot <- plot_ly(data = df_eda, type = "scatter", mode = "markers", alpha = 0.7) %>%
  add_trace(x = ~R, y = ~log_response, name = "R") %>%
  add_trace(x = ~G, y = ~log_response, name = "G") %>%
  add_trace(x = ~B, y = ~log_response, name = "B") %>%
  add_trace(x = ~Hue, y = ~log_response, name = "Hue") %>% 
  plotly::layout(title = "Scatterplot Matrix of Continuous Inputs vs. Response",
           showlegend = TRUE,
           legend = list(x = 0.5, y = 1, traceorder = "normal", font = list(family = "sans-serif", size = 12, color = "black")))

# Show the plot
scatterplot

###Visualize the behavior of the binary outcome with respect to the continuous inputs

df_long <- gather(df_eda, key = "variable", value = "value", R, G, B, Hue)
ggplot(df_long, aes(x = value, y = outcome, color = outcome)) +
  geom_point() +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Scatter Plots of Continuous Inputs vs. Binary Outcome",
       x = "Variable",
       y = "Binary Outcome")

#visualize the behavior of the binary outcome with respect to the categorical INPUTS?

df_long <- gather(df_eda, key = "variable", value = "value", Lightness,Saturation)
ggplot(df_long, aes(x = value, y = outcome, color = outcome)) +
  geom_point() +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Scatter Plots of Continuous Inputs vs. Binary Outcome",
       x = "Variable",
       y = "Binary Outcome")